rm(list=ls()) 
setwd("")#working directory
options(scipen=999)
options(max.print=1000000)

install.packages("readr")
install.packages("ggplot2")
install.packages("translateR")
install.packages("stringr")
install.packages("splitstackshape")
install.packages("cowplot")
install.packages("dplyr")
install.packages('Jmisc')
install.packages('psych')
install.packages('qwraps2')
install.packages("multiwayvcov")
install.packages("tm")
install.packages("RTextTools")
install.packages("qdap")
install.packages("qdapDictionaries")
install.packages("SnowballC")
install.packages("stm")

library(translateR);library(readr);library(ggplot2);library(stringr);library(plyr);
library(splitstackshape);library(data.table);library(dplyr);library(cowplot);library(Jmisc);library(psych);library(qwraps2);library(xtable);library(sandwich);library(stargazer);library(lmtest);library(ggpubr);library(multiwayvcov)
library(wordcloud);library(stm);library(readxl)
library(ggplotify);library(ggraph);library(tidygraph)
require(tm) 
require(RTextTools) 
require(qdap) 
require(qdapDictionaries)
require(SnowballC) 
library(tidyverse)

##start 
data <- read_csv("data_file2.csv")#load data with anonymized processed "text" 
data$my_station <- as.factor(data$my_station)
data <- data.frame(data)
data.frame(colnames(data))

##
male_complaint <- data[ with(data, grepl("other", data$complainant_gender)),]
fem_complaint <- data[ with(data, grepl("female", data$complainant_gender)),]
nongendered <- data[ with(data, grepl(0, data$gendered)),]
gendered <- data[ with(data, grepl(1, data$gendered)),]


###########
library(Rtsne)
library(rsvd)
library(geometry)

###########
set.seed(23456)
processed <- textProcessor(documents = data$text, metadata = data)

out <- prepDocuments(documents = processed$documents, 
                     vocab = processed$vocab,
                     meta = processed$meta, lower.thresh = 50)
docs <- out$documents
vocab <- out$vocab
meta <- out$meta

##
test2 <- stm(documents = out$documents, vocab = out$vocab,
             K = 32, prevalence =~ complainant_gender + gendered + urban + convicted + acquitted + dismissed,
             max.em.its = 75,
             data = out$meta, init.type = "Spectral")

#
topicnames <- c("(1) UNLICENSED:", "(2) CURRENCY:", "(3) DOWRY-A:", "(4) PHISHING:", "(5) CATTLE:", "(6) RESOURCE MAFIA:","(7) MISSING PERSON:", "(8) DOWRY-B:","(9) RAILWAY:","(10) ACCIDENT/ATTACK:","(11) LEWD BEHAVIOR:","(12) DEVELOPMENT:","(13) INJURY:","(14) DRIVING MISDEMEANOR:", "(15) FUGITIVE:","(16) BURGLARY:","(17) FIGHTING:","(18) ARMS:","(19) ALCOHOL:","(20) PROPERTY:", "(21) DRIVING ACCIDENT:","(22) AUTO THEFT-A:","(23) AUTO THEFT-B:","(24) CHEAT:","(25) MINORITIES:","(26) PHONE THEFT:","(27) KIDNAPPING:","(28) GAMBLING:","(29) CHAIN-SNATCH:","(30) ELECTRICTY THEFT:","(31) DRUGS:","(32) REAL ESTATE:")

#**FIGURE**
plot(test2, n=6, type = "summary", xlim = c(0, 0.35), main = "All Crime Top Topics", topic.names = topicnames)
plot(test2, n=6, type = "summary", xlim = c(0, 0.35), main = "All Crime Top FREX", labeltype = "frex",topic.names = topicnames)

figure1 <- as.ggplot(~plot(test2, n=6, type = "summary", xlim = c(0, 0.35), main = "All Crime Top Topics", topic.names = topicnames))+ theme(plot.margin = unit(c(-3,-3,-5,-6), "lines"))
figure2 <- as.ggplot(~plot(test2, n=6, type = "summary", xlim = c(0, 0.35), main = "All Crime Top FREX", topic.names = topicnames, labeltype = "frex"))+ theme(plot.margin = unit(c(-3,-3,-5,-6), "lines"))

##Figure A29
my_plot_list <- list(figure1,figure2)
ggarrange(plotlist = my_plot_list, ncol = 2,nrow=1,labels = c("A)", "B)"))

#
prep1 <- estimateEffect(1:32 ~ complainant_gender, test2, meta = out$meta, uncertainty = "Global")
prep2 <- estimateEffect(1:32 ~ gendered, test2, meta = out$meta, uncertainty = "Global")
prep3 <- estimateEffect(1:32 ~ urban, test2, meta = out$meta, uncertainty = "Global")
prep4 <- estimateEffect(1:32 ~ convicted, test2, meta = out$meta, uncertainty = "Global")
prep5 <- estimateEffect(1:32 ~ acquitted, test2, meta = out$meta, uncertainty = "Global")
prep6 <- estimateEffect(1:32 ~ dismissed, test2, meta = out$meta, uncertainty = "Global")


#
topicnames <- c("(1) UNLICENSED", "(2) CURRENCY", "(3) DOWRY-A", "(4) PHISHING", "(5) CATTLE", "(6) RESOURCE MAFIA","(7) MISSING PERSON", "(8) DOWRY-B","(9) RAILWAY","(10) ACCIDENT/ATTACK","(11) LEWD BEHAVIOR","(12) DEVELOPMENT","(13) INJURY","(14) DRIVING MISDEMEANOR", "(15) FUGITIVE","(16) BURGLARY","(17) FIGHTING","(18) ARMS","(19) ALCOHOL","(20) PROPERTY", "(21) DRIVING ACCIDENT","(22) AUTO THEFT-A","(23) AUTO THEFT-B","(24) CHEAT","(25) MINORITIES","(26) PHONE THEFT","(27) KIDNAPPING","(28) GAMBLING","(29) CHAIN-SNATCH","(30) ELECTRICTY THEFT","(31) DRUGS","(32) REAL ESTATE")

#Figures A30-A32
plot(prep1, "complainant_gender", model=test2, method="difference",cov.value1="female",cov.value2="other", 
     xlab = "Male ... Female", main = "Male/Female Complainants", ci.level = 0.95, verbose.labels = F, custom.labels=topicnames,labeltype="custom")

plot(prep2, "gendered", model=test2, method="difference",cov.value1="1",cov.value2="0", 
     xlab = "More Nongendered ... More Gendered", main = "Nongendered/Gendered Crime", ci.level = 0.95, verbose.labels = F, custom.labels=topicnames,labeltype="custom")

plot(prep3, "urban", model=test2, method="difference",cov.value1="1",cov.value2="0", 
     xlab = "Rural ... Urban", main = "Rural/Urban", ci.level = 0.95, verbose.labels = F,custom.labels=topicnames,labeltype="custom")

plot(prep4, "convicted", model=test2, method="difference",cov.value1="1",cov.value2="0", 
     xlab = "Not Convicted ... Convicted", main = "Conviction", ci.level = 0.95, verbose.labels = F, custom.labels=topicnames,labeltype="custom", xlim = c(-.10,.15))

plot(prep5, "acquitted", model=test2, method="difference",cov.value1="1",cov.value2="0", 
     xlab = "Unacquitted ... Acquitted", main = "Unacquitted/Acquitted", ci.level = 0.95, verbose.labels = F,custom.labels=topicnames,labeltype="custom")

plot(prep6, "dismissed", model=test2, method="difference",cov.value1="1",cov.value2="0", 
     xlab = "Not Dismissed ... Dismissed", main = "Not Dismissed/Dismissed", ci.level = 0.95, verbose.labels = F,custom.labels=topicnames,labeltype="custom")

##
labelTopics(test2)

#
word_table <- matrix(rep(NA, 64),nrow=64, ncol=2)
for(i in 1:32){
  word_table[i*2-1, 1] <- topicnames[i]
  high_prob <- paste(labelTopics(test2)$prob[i,], 
                     sep=", ", collapse=", ")
  frex <- paste(labelTopics(test2)$frex[i,], 
                sep=", ", collapse=", ")
  word_table[i*2-1,2] <- paste("Highest Prob: ", high_prob, sep="")
  word_table[i*2, 2] <- paste("FREX: ", frex, sep="")
}
colnames(word_table) <- c("Topic", "Top Words")

#**TABLE**Table A9
print(xtable(word_table), include.rownames=FALSE)

## 
cloud1 <- as.ggplot(~cloud(test2, topic = 1, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud2 <- as.ggplot(~cloud(test2, topic = 2, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud3 <- as.ggplot(~cloud(test2, topic = 3, scale = c(4, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud4 <- as.ggplot(~cloud(test2, topic = 4, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud5 <- as.ggplot(~cloud(test2, topic = 5, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud6 <- as.ggplot(~cloud(test2, topic = 6, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud7 <- as.ggplot(~cloud(test2, topic = 7, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud8 <- as.ggplot(~cloud(test2, topic = 8, scale = c(3, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud9 <- as.ggplot(~cloud(test2, topic = 9, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud10 <- as.ggplot(~cloud(test2, topic = 10, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud11 <- as.ggplot(~cloud(test2, topic = 11, scale = c(4, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud12 <- as.ggplot(~cloud(test2, topic = 12, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud13 <- as.ggplot(~cloud(test2, topic = 13, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud14 <- as.ggplot(~cloud(test2, topic = 14, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud15 <- as.ggplot(~cloud(test2, topic = 15, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud16 <- as.ggplot(~cloud(test2, topic = 16, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud17 <- as.ggplot(~cloud(test2, topic = 17, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud18 <- as.ggplot(~cloud(test2, topic = 18, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud19 <- as.ggplot(~cloud(test2, topic = 19, scale = c(4, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud20 <- as.ggplot(~cloud(test2, topic = 20, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud21 <- as.ggplot(~cloud(test2, topic = 21, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud22 <- as.ggplot(~cloud(test2, topic = 22, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud23 <- as.ggplot(~cloud(test2, topic = 23, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud24 <- as.ggplot(~cloud(test2, topic = 24, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud25 <- as.ggplot(~cloud(test2, topic = 25, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud26 <- as.ggplot(~cloud(test2, topic = 26, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud27 <- as.ggplot(~cloud(test2, topic = 27, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud28 <- as.ggplot(~cloud(test2, topic = 28, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud29 <- as.ggplot(~cloud(test2, topic = 29, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud30 <- as.ggplot(~cloud(test2, topic = 30, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud31 <- as.ggplot(~cloud(test2, topic = 31, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud32 <- as.ggplot(~cloud(test2, topic = 32, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))


##**FIGURES**Figure A32-35
my_plot_list <- list(cloud1,cloud2,cloud3,cloud4, cloud5, cloud6, cloud7, cloud8)
ggarrange(plotlist = my_plot_list, ncol = 4,nrow=2,labels = c("1", "2", "3", "4","5","6","7","8"))

my_plot_list <- list(cloud9,cloud10,cloud11,cloud12, cloud13, cloud14, cloud15, cloud16)
ggarrange(plotlist = my_plot_list, ncol = 4,nrow=2,labels = c("9", "10", "11", "12","13","14","15","16"))

my_plot_list <- list(cloud17,cloud18,cloud19,cloud20, cloud21, cloud22, cloud23, cloud24)
ggarrange(plotlist = my_plot_list, ncol = 4,nrow=2,labels = c("17", "18", "19", "20","21","22","23","24"))

my_plot_list <- list(cloud25,cloud26,cloud27,cloud28, cloud29, cloud30, cloud31, cloud32)
ggarrange(plotlist = my_plot_list, ncol = 4,nrow=2,labels = c("25", "26", "27", "28","29","30","31","32"))


#Thanks to Chris Lucas
mod.out.corr <- topicCorr(test2)
plot(mod.out.corr)

prep <- plot.estimateEffect(prep4, "convicted", model=test2, method="difference",cov.value1=1,cov.value2=0)

## 
library(igraph)
cor(test2$theta)
mat2 <- cor(test2$theta)
mat2[mat2<0] <- 0
diag(mat2) <- 0
mat2
out <- mat2
g <- graph.adjacency(out, mode="undirected", weighted=T)
cor(test2$theta)[,1]
E(g)
edges <- get.edgelist(g)
edgecors <- rep(NA,nrow(edges))
for(i in 1:nrow(edges)){
  edgecors[i] <- cor(test2$theta)[edges[i,1],edges[i,2]]
}
edge.width=35*edgecors
E(g)$weight
E(g)$size <- 1
E(g)$lty <- 1
E(g)$color <- "black"
est <- unlist(lapply(prep$means,function(x){return(x[1])}))
table(est)
mycols <- rev(colorRampPalette(c("red", "white", "blue"), bias=1)( 20 )) ## (n)
seq(-.1,.1,length.out=20)
colcat <- rep(NA,length(est))
for(i in 1:length(colcat)){
  colcat[i] <- max(which(est[i] > seq(-.1,.1,length.out=20)))
}
colcat
mycols[colcat]
plot(est,1:32,pch=19,cex=2,col=mycols[colcat]);abline(v=0)
V(g)$label=topicnames
vertex.color = mycols[colcat]
vertex.label.cex = 0.7
vertex.label.color = "black"
edge.color = "gray60"
wts <- E(g)$weight
mylayout <- layout.fruchterman.reingold(g,weight=wts)


##**FIGURE**
figure1 <- as.ggplot(~plot(prep4, "convicted", model=test2, method="difference",cov.value1="1",cov.value2="0", 
                           xlab = "Not Convicted ... Convicted", main = "All Crime Conviction", ci.level = 0.95, verbose.labels = F, custom.labels=topicnames,labeltype="custom", xlim = c(-.16,.15)))+ theme(plot.margin = unit(c(-4,-3,-7,-5), "lines"))

figure2 <- as.ggplot(~plot(g, layout=mylayout,edge.color=edge.color,vertex.color=vertex.color, vertex.label.cex=vertex.label.cex, vertex.label.color=vertex.label.color,edge.width=edge.width, main = "Correlation", ylim=c(-0.9,0.9),xlim=c(-0.9,1.1), asp = 0)) + theme(plot.margin = unit(c(-4,-3,-7,-2), "lines"))

my_plot_list <- list(figure1,figure2)
ggarrange(plotlist = my_plot_list, ncol = 2,nrow=1,labels = c("A)", "B)"))


##**FIGURE**5
figure1 <- as.ggplot(~plot(prep4, "convicted", model=test2, method="difference",cov.value1="1",cov.value2="0", 
                           xlab = "Not Convicted ... Convicted", ci.level = 0.95, verbose.labels = F, custom.labels=topicnames,labeltype="custom", xlim = c(-.16,.16),cex.lab = 1.5, cex.axis = 1.5, xaxs="i"))+ theme(plot.margin = unit(c(-4,-3,-7,-5), "lines"))

figure2 <- ggraph(g, layout = "fr") +
  geom_edge_link(aes(edge_alpha = edge.width, edge_width = edge.width), edge_colour = "gray", show.legend = FALSE) +
  geom_node_point(aes(fill = colcat), shape = 21, size = 8, color = "white", alpha=0.9, show.legend = FALSE) +
  theme_void() +
  geom_node_text(aes(label = label), repel = TRUE, colour="black") +
  scale_fill_gradient2(low = "blue",  mid = "darkgray", high = "brown", midpoint = 10)


my_plot_list <- list(figure1,figure2)#Fig 5
ggarrange(plotlist = my_plot_list, ncol = 2,nrow=1,labels = c("A)", "B)"),font.label = list(size = 20))


#########################################################################################################
set.seed(23456)
processed <- textProcessor(documents = fem_complaint$text, metadata = fem_complaint)

out <- prepDocuments(documents = processed$documents, 
                     vocab = processed$vocab,
                     meta = processed$meta, lower.thresh = 50)
docs <- out$documents
vocab <- out$vocab
meta <- out$meta

#
female_data <- stm(documents = out$documents, vocab = out$vocab,
                   K = 24, prevalence =~ gendered + urban + convicted + acquitted + dismissed,
                   max.em.its = 75,
                   data = out$meta, init.type = "Spectral")

#
topicnames <- c("(1) VILLAGE PROBLEM:", "(2) POISONING:", "(3) CHEAT:", "(4) UNLICENSED:", "(5) DOWRY-A:", "(6) DOWRY-B:", "(7) REAL ESTATE:", "(8) DEVELOPMENT:", "(9) DOWRY-C:", "(10) CHILD ABUSE/RAPE:", "(11) BURGLARY:", "(12) DOMESTIC VIOLENCE:", "(13) DOWRY-D:", "(14) FIGHTING:", "(15) CHAIN-SNATCH:", "(16) CRIMINAL FORCE:", "(17) RAPE:", "(18) RUNAWAY/SUICIDE:", "(19) MISSING PERSON:", "(20) MISCELLANEOUS:", "(21) PHISHING:", "(22) DRIVING ACCIDENT:", "(23) DOWRY-E:", "(24) ALCOHOL:")

#**FIGURE**
plot(female_data, n=6, type = "summary", xlim = c(0, 0.35), main = "Female Complainant Top Topics", topic.names = topicnames, text.cex = 2, cex.lab = 2, cex.axis = 2, cex.main = 3, xaxs="i", yaxs="i") 
plot(female_data, n=6, type = "summary", xlim = c(0, 0.35), main = "Female Complainant Top FREX", labeltype = "frex",topic.names = topicnames)#What xaxs does is reduce whitespace

figure1 <- as.ggplot(~plot(female_data, n=6, type = "summary", xlim = c(0, 1), main = "Female Complainant Top Topics", topic.names = topicnames, text.cex=1.8, cex.lab = 1.5, cex.axis = 2, cex.main = 3,xaxs="i"))+ theme(plot.margin = unit(c(-3,-3,-5,-6), "lines")) 
figure2 <- as.ggplot(~plot(female_data, n=6, type = "summary", xlim = c(0, 1), main = "Female Complainant Top FREX", topic.names = topicnames, text.cex=1.8, cex.lab = 1.5, cex.axis = 2, cex.main = 3, xaxs="i",labeltype = "frex"))+ theme(plot.margin = unit(c(-3,-3,-5,-6), "lines"))

##**FIGURES**
my_plot_list <- list(figure1,figure2)
ggarrange(plotlist = my_plot_list, ncol = 2,nrow=1,font.label = list(size = 26), labels = c("A)", "B)"))

##
rep2 <- estimateEffect(1:24 ~ gendered, female_data, meta = out$meta, uncertainty = "Global")
rep3 <- estimateEffect(1:24 ~ urban, female_data, meta = out$meta, uncertainty = "Global")
rep4 <- estimateEffect(1:24 ~ convicted, female_data, meta = out$meta, uncertainty = "Global")
rep5 <- estimateEffect(1:24 ~ acquitted, female_data, meta = out$meta, uncertainty = "Global")
rep6 <- estimateEffect(1:24 ~ dismissed, female_data, meta = out$meta, uncertainty = "Global")

#
topicnames <- c("(1) VILLAGE PROBLEM", "(2) POISONING", "(3) CHEAT", "(4) UNLICENSED", "(5) DOWRY-A", "(6) DOWRY-B", "(7) REAL ESTATE", "(8) DEVELOPMENT", "(9) DOWRY-C", "(10) CHILD ABUSE/RAPE", "(11) BURGLARY", "(12) DOMESTIC VIOLENCE", "(13) DOWRY-D", "(14) FIGHTING", "(15) CHAIN-SNATCH", "(16) CRIMINAL FORCE", "(17) RAPE", "(18) RUNAWAY/SUICIDE", "(19) MISSING PERSON", "(20) MISCELLANEOUS", "(21) PHISHING", "(22) DRIVING ACCIDENT", "(23) DOWRY-E", "(24) ALCOHOL")

#Figure A36
plot(rep3, "urban", model=female_data, method="difference",cov.value1="1",cov.value2="0", 
     xlab = "", main = "Urban", ci.level = 0.95, verbose.labels = F, custom.labels=topicnames,labeltype="custom")

plot(rep5, "acquitted", model=female_data, method="difference",cov.value1="1",cov.value2="0", 
     xlab = "", main = "Acquitted", ci.level = 0.95, verbose.labels = F, custom.labels=topicnames,labeltype="custom")

plot(rep6, "dismissed", model=female_data, method="difference",cov.value1="1",cov.value2="0", 
     xlab = "", main = "Dismissed", ci.level = 0.95, verbose.labels = F, custom.labels=topicnames,labeltype="custom")

#**FIGURE*
plot(rep4, "convicted", model=female_data, method="difference",cov.value1="1",cov.value2="0", 
     xlab = "", main = "Conviction", ci.level = 0.95, verbose.labels = F, custom.labels=topicnames,labeltype="custom")

##
labelTopics(female_data)

#
word_table <- matrix(rep(NA, 48),nrow=48, ncol=2)
for(i in 1:24){
  word_table[i*2-1, 1] <- topicnames[i]
  high_prob <- paste(labelTopics(female_data)$prob[i,], 
                     sep=", ", collapse=", ")
  frex <- paste(labelTopics(female_data)$frex[i,], 
                sep=", ", collapse=", ")
  word_table[i*2-1,2] <- paste("Highest Prob: ", high_prob, sep="")
  word_table[i*2, 2] <- paste("FREX: ", frex, sep="")
}
colnames(word_table) <- c("Topic", "Top Words")

#**TABLE**Table A10
print(xtable(word_table), include.rownames=FALSE)

##
fcloud1 <- as.ggplot(~cloud(female_data, topic = 1, scale = c(4, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
fcloud2 <- as.ggplot(~cloud(female_data, topic = 2, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
fcloud3 <- as.ggplot(~cloud(female_data, topic = 3, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
fcloud4 <- as.ggplot(~cloud(female_data, topic = 4, scale = c(4, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
fcloud5 <- as.ggplot(~cloud(female_data, topic = 5, scale = c(5, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
fcloud6 <- as.ggplot(~cloud(female_data, topic = 6, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
fcloud7 <- as.ggplot(~cloud(female_data, topic = 7, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
fcloud8 <- as.ggplot(~cloud(female_data, topic = 8, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
fcloud9 <- as.ggplot(~cloud(female_data, topic = 9, scale = c(5, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
fcloud10 <- as.ggplot(~cloud(female_data, topic = 10, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
fcloud11 <- as.ggplot(~cloud(female_data, topic = 11, scale = c(5, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
fcloud12 <- as.ggplot(~cloud(female_data, topic = 12, scale = c(5, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
fcloud13 <- as.ggplot(~cloud(female_data, topic = 13, scale = c(5, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
fcloud14 <- as.ggplot(~cloud(female_data, topic = 14, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
fcloud15 <- as.ggplot(~cloud(female_data, topic = 15, scale = c(5, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
fcloud16 <- as.ggplot(~cloud(female_data, topic = 16, scale = c(5, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
fcloud17 <- as.ggplot(~cloud(female_data, topic = 17, scale = c(5, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
fcloud18 <- as.ggplot(~cloud(female_data, topic = 19, scale = c(5, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
fcloud19 <- as.ggplot(~cloud(female_data, topic = 19, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
fcloud20 <- as.ggplot(~cloud(female_data, topic = 20, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
fcloud21 <- as.ggplot(~cloud(female_data, topic = 21, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
fcloud22 <- as.ggplot(~cloud(female_data, topic = 22, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
fcloud23 <- as.ggplot(~cloud(female_data, topic = 23, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
fcloud24 <- as.ggplot(~cloud(female_data, topic = 24, scale = c(5, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))


##**FIGURES**Figures A37-39
my_plot_list <- list(fcloud1,fcloud2,fcloud3,fcloud4, fcloud5, fcloud6, fcloud7, fcloud8)
ggarrange(plotlist = my_plot_list, ncol = 4,nrow=2,labels = c("1", "2", "3", "4","5","6","7","8"))

my_plot_list <- list(fcloud9,fcloud10,fcloud11,fcloud12, fcloud13, fcloud14, fcloud15, fcloud16)
ggarrange(plotlist = my_plot_list, ncol = 4,nrow=2,labels = c("9", "10", "11", "12","13","14","15","16"))

my_plot_list <- list(fcloud17,fcloud18,fcloud19,fcloud20, fcloud21, fcloud22, fcloud23, fcloud24)
ggarrange(plotlist = my_plot_list, ncol = 4,nrow=2,labels = c("17", "18", "19", "20", "21","22","23","24"))

#
mod.out.corr <- topicCorr(female_data)
plot(mod.out.corr)

fprep <- plot.estimateEffect(rep4, "convicted", model=female_data, method="difference",cov.value1=1,cov.value2=0)

library(igraph)
cor(female_data$theta)

mat2 <- cor(female_data$theta)
mat2[mat2<0] <- 0
diag(mat2) <- 0
mat2
out <- mat2
g <- graph.adjacency(out, mode="undirected", weighted=T)
cor(female_data$theta)[,1]
E(g)
edges <- get.edgelist(g)
edgecors <- rep(NA,nrow(edges))
for(i in 1:nrow(edges)){
  edgecors[i] <- cor(female_data$theta)[edges[i,1],edges[i,2]]
}
edge.width=35*edgecors
E(g)$weight
E(g)$size <- 1
E(g)$lty <- 1
E(g)$color <- "black"
est <- unlist(lapply(fprep$means,function(x){return(x[1])}))#Input the plot "fprep" here
table(est)
mycols <- rev(colorRampPalette(c("red", "white", "blue"), bias=1)( 20 )) ## (n)
seq(-.1,.1,length.out=20)
colcat <- rep(NA,length(est))
for(i in 1:length(colcat)){
  colcat[i] <- max(which(est[i] > seq(-.1,.1,length.out=20)))
}
colcat
mycols[colcat]
plot(est,1:24,pch=19,cex=1,col=mycols[colcat]);abline(v=0)
V(g)$label=topicnames
vertex.color = mycols[colcat]
vertex.label.cex = 1
vertex.label.color = "black"
edge.color = "gray60"
wts <- E(g)$weight
mylayout <- layout.fruchterman.reingold(g,weight=wts)
V(g)$size <- 12

plot(g, layout=mylayout,edge.color=edge.color,vertex.color=vertex.color, vertex.label.cex=vertex.label.cex, vertex.label.color=vertex.label.color,edge.width=edge.width, ylim=c(-0.9,0.9),xlim=c(-0.9,1.1), asp = -1)


##**FIGURE**8
figure1 <- as.ggplot(~plot(rep4, "convicted", model=female_data, method="difference",cov.value1="1",cov.value2="0", 
                           xlab = "Not Convicted ... Convicted", ci.level = 0.95, verbose.labels = F, custom.labels=topicnames,labeltype="custom", xlim = c(-.085,.055),cex.lab = 1.5, cex.axis = 1.5, xaxs="i"))+ theme(plot.margin = unit(c(-4,-3,-7,-5), "lines"))

figure2 <- ggraph(g, layout = "fr") +
  geom_edge_link(aes(edge_alpha = edge.width, edge_width = edge.width), edge_colour = "gray", show.legend = FALSE) +
  geom_node_point(aes(fill = colcat), shape = 21, size = 8, color = "white", alpha=0.9, show.legend = FALSE) +
  theme_void() +
  geom_node_text(aes(label = label), repel = TRUE, colour="black") +
  scale_fill_gradient2(low = "blue",  mid = "darkgray", high = "brown", midpoint = 10)


my_plot_list <- list(figure1,figure2)#FIG 8
ggarrange(plotlist = my_plot_list, ncol = 2,nrow=1)

ggarrange(plotlist = my_plot_list, ncol = 2,nrow=1,labels = c("A)", "B)"),font.label = list(size = 20))


#########################################################################################################
set.seed(23456)
processed <- textProcessor(documents = gendered$text, metadata = gendered)

out <- prepDocuments(documents = processed$documents, 
                     vocab = processed$vocab,
                     meta = processed$meta, lower.thresh = 50)
docs <- out$documents
vocab <- out$vocab
meta <- out$meta

#
gendered_data <- stm(documents = out$documents, vocab = out$vocab,
                     K = 20, prevalence =~ urban + convicted + acquitted + dismissed,
                     max.em.its = 75,
                     data = out$meta, init.type = "Spectral")


#
topicnames <- c("(1) DOWRY-MENTAL:", "(2) DOWRY-PHYSICAL:", "(3) DOWRY-PREGNANCY:", "(4) DOWRY-ECONOMIC:", "(5) UNLICENSED (SEX SELECTION):", "(6) DOWRY-RAPE:", "(7) KILLING GIRL CHILD:", "(8) DOWRY DEATH:", "(9) ALCOHOL:", "(10) HURT/DOMESTIC VIOLENCE:", "(11) KIDNAPPING:", "(12) TRAFFICKING:", "(13) BLACKMAIL:", "(14) SEX SELECTION/ABORTION:", "(15) DOWRY-EXTENDED:", "(16) DOWRY-POST COUNSELING:", "(17) RAPE:", "(18) LEWD PHOTOS:", "(19) DOWRY-DESERTION:", "(20) DOWRY-STARVATION:")

#**FIGURE**
plot(gendered_data, n=6, type = "summary", xlim = c(0, 0.45), main = "VAW Top Topics", topic.names = topicnames,text.cex = 2, cex.lab = 2, cex.axis = 2, cex.main = 3, xaxs="i", yaxs="i") 
plot(gendered_data, n=6, type = "summary", xlim = c(0, 0.45), main = "VAW Top FREX", labeltype = "frex",topic.names = topicnames)

figure1 <- as.ggplot(~plot(gendered_data, n=6, type = "summary", xlim = c(0, 1), main = "VAW Top Topics", topic.names = topicnames,text.cex=1.8, cex.lab = 1.5, cex.axis = 2, cex.main = 3,xaxs="i"))+ theme(plot.margin = unit(c(-3,-3,-5,-6), "lines")) 
figure2 <- as.ggplot(~plot(gendered_data, n=6, type = "summary", xlim = c(0, 1), main = "VAW Top FREX", topic.names = topicnames, text.cex=1.8, cex.lab = 1.5, cex.axis = 2, cex.main = 3,xaxs="i",labeltype = "frex"))+ theme(plot.margin = unit(c(-3,-3,-5,-6), "lines"))

##**FIGURES**Figure 7
my_plot_list <- list(figure1,figure2)
ggarrange(plotlist = my_plot_list, ncol = 2,nrow=1,font.label = list(size = 26),labels = c("A)", "B)"))


#
topicnames <- c("(1) DOWRY-MENTAL", "(2) DOWRY-PHYSICAL", "(3) DOWRY-PREGNANCY", "(4) DOWRY-ECONOMIC", "(5) UNLICENSED (SEX SELECTION)", "(6) DOWRY-RAPE", "(7) KILLING GIRL CHILD", "(8) DOWRY DEATH", "(9) ALCOHOL", "(10) HURT/DOMESTIC VIOLENCE", "(11) KIDNAPPING", "(12) TRAFFICKING", "(13) BLACKMAIL", "(14) SEX SELECTION/ABORTION", "(15) DOWRY-EXTENDED", "(16) DOWRY-POST COUNSELING", "(17) RAPE", "(18) LEWD PHOTOS", "(19) DOWRY-DESERTION", "(20) DOWRY-STARVATION")

#
ep1 <- estimateEffect(1:20 ~ complainant_gender, gendered_data, meta = out$meta, uncertainty = "Global")
ep3 <- estimateEffect(1:20 ~ urban, gendered_data, meta = out$meta, uncertainty = "Global")
ep4 <- estimateEffect(1:20 ~ convicted, gendered_data, meta = out$meta, uncertainty = "Global")
ep5 <- estimateEffect(1:20 ~ acquitted, gendered_data, meta = out$meta, uncertainty = "Global")
ep6 <- estimateEffect(1:20 ~ dismissed, gendered_data, meta = out$meta, uncertainty = "Global")


##Figure A40
plot(ep1, "complainant_gender", model=gendered_data, method="difference",cov.value1="female",cov.value2="other", 
     xlab = "Male ... Female", main = "Male/Female Complainants", ci.level = 0.95, verbose.labels = F, custom.labels=topicnames,labeltype="custom")

plot(ep3, "urban", model=gendered_data, method="difference",cov.value1="1",cov.value2="0", 
     xlab = "", main = "Urban", ci.level = 0.95, verbose.labels = F, custom.labels=topicnames,labeltype="custom")
#Figure A41
plot(ep5, "acquitted", model=gendered_data, method="difference",cov.value1="1",cov.value2="0", 
     xlab = "", main = "Acquitted", ci.level = 0.95, verbose.labels = F, custom.labels=topicnames,labeltype="custom")

plot(ep6, "dismissed", model=gendered_data, method="difference",cov.value1="1",cov.value2="0", 
     xlab = "", main = "Dismissed", ci.level = 0.95, verbose.labels = F, custom.labels=topicnames,labeltype="custom")


#**FIGURE*
plot(ep4, "convicted", model=gendered_data, method="difference",cov.value1="1",cov.value2="0", 
     xlab = "", main = "Conviction", ci.level = 0.95, verbose.labels = F, custom.labels=topicnames,labeltype="custom")

#
word_table <- matrix(rep(NA, 40),nrow=40, ncol=2)
for(i in 1:20){
  word_table[i*2-1, 1] <- topicnames[i]
  high_prob <- paste(labelTopics(gendered_data)$prob[i,], 
                     sep=", ", collapse=", ")
  frex <- paste(labelTopics(gendered_data)$frex[i,], 
                sep=", ", collapse=", ")
  word_table[i*2-1,2] <- paste("Highest Prob: ", high_prob, sep="")
  word_table[i*2, 2] <- paste("FREX: ", frex, sep="")
}
colnames(word_table) <- c("Topic", "Top Words")

#**TABLE**Table A11
print(xtable(word_table), include.rownames=FALSE)

#
cloud1 <- as.ggplot(~cloud(gendered_data, topic = 1, scale = c(5, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud2 <- as.ggplot(~cloud(gendered_data, topic = 2, scale = c(5, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud3 <- as.ggplot(~cloud(gendered_data, topic = 3, scale = c(5, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud4 <- as.ggplot(~cloud(gendered_data, topic = 4, scale = c(5, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud5 <- as.ggplot(~cloud(gendered_data, topic = 5, scale = c(5, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud6 <- as.ggplot(~cloud(gendered_data, topic = 6, scale = c(4, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud7 <- as.ggplot(~cloud(gendered_data, topic = 7, scale = c(5, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud8 <- as.ggplot(~cloud(gendered_data, topic = 8, scale = c(5, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud9 <- as.ggplot(~cloud(gendered_data, topic = 9, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud10 <- as.ggplot(~cloud(gendered_data, topic = 10, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud11 <- as.ggplot(~cloud(gendered_data, topic = 11, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud12 <- as.ggplot(~cloud(gendered_data, topic = 12, scale = c(4, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud13 <- as.ggplot(~cloud(gendered_data, topic = 13, scale = c(4, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud14 <- as.ggplot(~cloud(gendered_data, topic = 14, scale = c(5, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud15 <- as.ggplot(~cloud(gendered_data, topic = 15, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud16 <- as.ggplot(~cloud(gendered_data, topic = 16, scale = c(5, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud17 <- as.ggplot(~cloud(gendered_data, topic = 17, scale = c(6, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud18 <- as.ggplot(~cloud(gendered_data, topic = 18, scale = c(5, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud19 <- as.ggplot(~cloud(gendered_data, topic = 19, scale = c(4, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))
cloud20 <- as.ggplot(~cloud(gendered_data, topic = 20, scale = c(5, .90), max.words = 40)) + theme(plot.margin = unit(c(-5,-5,-5,-5), "lines"))


##**FIGURES**Figures A42-44
my_plot_list <- list(cloud1,cloud2,cloud3,cloud4, cloud5, cloud6, cloud7, cloud8)
ggarrange(plotlist = my_plot_list, ncol = 4,nrow=2,labels = c("1", "2", "3", "4","5","6","7","8"))

my_plot_list <- list(cloud9,cloud10,cloud11,cloud12, cloud13, cloud14, cloud15, cloud16)
ggarrange(plotlist = my_plot_list, ncol = 4,nrow=2,labels = c("9", "10", "11", "12","13","14","15","16"))

my_plot_list <- list(cloud17,cloud18,cloud19,cloud20)
ggarrange(plotlist = my_plot_list, ncol = 2,nrow=2,labels = c("17", "18", "19", "20"))


#
mod.out.corr <- topicCorr(gendered_data)
plot(mod.out.corr)

prep <- plot.estimateEffect(ep4, "convicted", model=gendered_data, method="difference",cov.value1=1,cov.value2=0)

library(igraph)
cor(gendered_data$theta)
mat2 <- cor(gendered_data$theta)
mat2[mat2<0] <- 0
diag(mat2) <- 0
mat2
out <- mat2
g <- graph.adjacency(out, mode="undirected", weighted=T)
cor(gendered_data$theta)[,1]
E(g)
edges <- get.edgelist(g)
edgecors <- rep(NA,nrow(edges))
for(i in 1:nrow(edges)){
  edgecors[i] <- cor(gendered_data$theta)[edges[i,1],edges[i,2]]
}
edge.width=35*edgecors
E(g)$weight
E(g)$size <- 1
E(g)$lty <- 1
E(g)$color <- "black"
est <- unlist(lapply(prep$means,function(x){return(x[1])}))
table(est)
mycols <- rev(colorRampPalette(c("red", "white", "blue"), bias=1)( 20 )) 
seq(-.1,.1,length.out=20)
colcat <- rep(NA,length(est))
for(i in 1:length(colcat)){
  colcat[i] <- max(which(est[i] > seq(-.1,.1,length.out=20)))
}
colcat
mycols[colcat]
plot(est,1:20,pch=19,cex=2,col=mycols[colcat]);abline(v=0)
V(g)$label=topicnames
vertex.color = mycols[colcat]
vertex.label.cex = 0.8
vertex.label.color = "black"
edge.color = "gray60"
wts <- E(g)$weight
mylayout <- layout.fruchterman.reingold(g,weight=wts)
V(g)$size <- 12


##**FIGURE**
plot(g, layout=mylayout,edge.color=edge.color,vertex.color=vertex.color, vertex.label.cex=vertex.label.cex, vertex.label.color=vertex.label.color,edge.width=edge.width, ylim=c(-1,1),xlim=c(-1,1), asp = 0)


##**FIGURE**Figure 9
figure1 <- as.ggplot(~plot(ep4, "convicted", model=gendered_data, method="difference",cov.value1="1",cov.value2="0", 
                           xlab = "Not Convicted ... Convicted", ci.level = 0.95, verbose.labels = F, custom.labels=topicnames,labeltype="custom", xlim = c(-.17,.12),cex.lab = 1.5, cex.axis = 1.5, xaxs="i"))+ theme(plot.margin = unit(c(-4,-3,-7,-5), "lines"))

figure2 <- ggraph(g, layout = "fr") +
  geom_edge_link(aes(edge_alpha = edge.width, edge_width = edge.width), edge_colour = "gray", show.legend = FALSE) +
  geom_node_point(aes(fill = colcat), shape = 21, size = 8, color = "white", alpha=0.9, show.legend = FALSE) +
  theme_void() +
  geom_node_text(aes(label = label), repel = TRUE, colour="black") +
  scale_fill_gradient2(low = "blue",  mid = "darkgray", high = "brown", midpoint = 10)

my_plot_list <- list(figure1,figure2)
ggarrange(plotlist = my_plot_list, ncol = 2,nrow=1)
ggarrange(plotlist = my_plot_list, ncol = 2,nrow=1,labels = c("A)", "B)"),font.label = list(size = 20))



####################################################################################################
####################################################################################################
####################################################################################################
data$female_complainant <- ifelse(grepl("female", data$complainant_gender), 1, 0)

##
set.seed(23456)
processed <- textProcessor(documents = data$text, metadata = data)

out <- prepDocuments(documents = processed$documents, 
                     vocab = processed$vocab,
                     meta = processed$meta, lower.thresh = 500)
docs <- out$documents
vocab <- out$vocab
meta <- out$meta

##Thanks to Brandon Stewart

##########
K <- 32 
stm.out.c <- stm(out$documents, out$vocab, K=K, prevalence=~
                   female_complainant, content=~female_complainant,data=out$meta,
                   max.em.its=25, 
                   seed=1033311)

labelTopics(stm.out.c)
plot(stm.out.c, n=6, type = "summary", xlim = c(0, 0.35), main = "All Crime Content Covariate")


## 
betaindex <- rep(2,length(stm.out.c$settings$covariates$betaindex))
beta <- list(beta=lapply(stm.out.c$beta$logbeta, exp))
if(!is.null(stm.out.c$beta$kappa)) beta$kappa <- stm.out.c$beta$kappa
sigma <- stm.out.c$sigma
lambda <- stm.out.c$eta
suffstats <- stm:::estep(documents=out$documents,
                           beta.index=betaindex,
                           update.mu=(!is.null(stm.out.c$mu$gamma)),
                           beta$beta,
                           lambda, stm.out.c$mu$mu,
                           sigma,
                           verbose=T)
lambda <- cbind(suffstats$lambda,0)
theta <- exp(lambda - stm:::row.lse(lambda))
rm(betaindex,beta,sigma,lambda, suffstats)

## 
library(devtools)
#install_github("bstewart/stm", dependencies=T)
library(stm)
library(stringr)
library(textir)
library(cem)
library(kernlab)
library(rbounds)
library(xtable)
library(MASS)
library(weights)
library(Zelig)
library(stargazer)


## Source the function for the stm projection
project <- function(mod, docs, interactions=FALSE, 
                    int.type=c("refit", "phi", "theta")) {
  int.type <- match.arg(int.type)
  t1 <- proc.time()
  if(!require(Matrix)) stop("Install the Matrix Package")  
  kappa <- mod$beta$kappa$params
  K <- mod$setting$dim$K
  A <- mod$setting$dim$A
  index <- sort(unique(mod$settings$covariates$betaindex))
  if(is.null(mod$settings$kappa$contrast)) mod$settings$kappa$contrast <- FALSE
  contrast <- mod$settings$kappa$contrast
  if(contrast) { 
    index <- index[-length(index)]  
  }
  
  proj <- vector(mode="list", length=length(index))
  for(i in 1:length(index)) {
    proj[[i]] <- unlist(lapply(docs, function(doc) {
      sum((kappa[[K+index[i]]])[doc[1,]]*doc[2,])/sum(doc[2,])
    }))
  }
  
  if(interactions & contrast) {
    if(int.type%in% c("refit", "phi")) stop("Only int.type='theta' works with contrast")
  }
  
  if(interactions & int.type=="refit") {
    cat("Starting Interactions...\n This may take a while...it prints a dot every 100 docs. \n")
    logbeta <- mod$beta$logbeta
    diagnostic <- vector(mode="list", length=length(index))
    for(i in 1:length(diagnostic)) {
      diagnostic[[i]] <- rep(0,mod$settings$dim$V)
    }
    agg.wcounts <- rep(0, length(index))
    
    for(d in 1:length(docs)) {
      for(a in 1:A) {
        expeta <- c(exp(mod$eta[d,]),1)
        
        
        EB <- expeta*exp(logbeta[[index[a]]][,docs[[d]][1,],drop=FALSE])
        EB <- t(EB)/colSums(EB) 
        
        
        phi <- t(EB*(docs[[d]][2,])) 
        
        kappamat <- Matrix(do.call(rbind,kappa[(K+ length(index) + (a-1)*K + 1):(K+ length(index) + a*K)]))
        kappamat <- kappamat[,docs[[d]][1,]]
        word.proj <- Matrix::colSums(kappamat*phi)
        
        diagnostic[[a]][docs[[d]][1,]] <-diagnostic[[a]][docs[[d]][1,]] +  word.proj
        agg.wcounts[a] <- agg.wcounts[a] + sum(docs[[d]][2,])
        
        proj[[a]][d] <- proj[[a]][d] + sum(word.proj)/sum(docs[[d]][2,])
      }
      if(d%%100==0) cat(".")
    }
  } 
  if(interactions & int.type=="phi") {
    cat("Starting Interactions...\n This may take a while...it prints a dot every 100 docs. \n")
    logbeta <- mod$beta$logbeta
    diagnostic <- vector(mode="list", length=length(index))
    for(i in 1:length(diagnostic)) {
      diagnostic[[i]] <- rep(0,mod$settings$dim$V)
    }
    agg.wcounts <- rep(0, length(index))
    bindex <- mod$settings$covariates$betaindex
    for(d in 1:length(docs)) {
      expeta <- c(exp(mod$eta[d,]),1)
      EB <- expeta*exp(logbeta[[bindex[d]]][,docs[[d]][1,],drop=FALSE]) 
      EB <- t(EB)/colSums(EB) 
      
      phi <- t(EB*(docs[[d]][2,])) 
      for(a in 1:length(index)) {
        
        kappamat <- Matrix(do.call(rbind,kappa[(K+ length(index) + (a-1)*K + 1):(K+ length(index) + a*K)]))
        kappamat <- kappamat[,docs[[d]][1,]]
        word.proj <- Matrix::colSums(kappamat*phi)
        
        diagnostic[[a]][docs[[d]][1,]] <-diagnostic[[a]][docs[[d]][1,]] +  word.proj
        agg.wcounts[a] <- agg.wcounts[a] + sum(docs[[d]][2,])
        
        proj[[a]][d] <- proj[[a]][d] + sum(word.proj)/sum(docs[[d]][2,])
      }
      if(d%%100==0) cat(".")
    }}
  if(interactions & int.type=="theta") {
    cat("Starting Interactions...\n This may take a while...it prints a dot every 100 docs. \n")
    diagnostic <- vector(mode="list", length=length(index))
    for(i in 1:length(diagnostic)) {
      diagnostic[[i]] <- rep(0,mod$settings$dim$V)
    }
    agg.wcounts <- rep(0, length(index))
    for(d in 1:length(docs)) {
      theta <- mod$theta[d,]
      wct <- matrix(rep(docs[[d]][2,], each=K), nrow=K)
      theta.wct <- wct*theta
      for(a in 1:length(index)) {    
        kappamat <- Matrix(do.call(rbind,kappa[(K+ length(index) + (a-1)*K + 1):(K+ length(index) + a*K)]))
        kappamat <- kappamat[,docs[[d]][1,]]
        word.proj <- Matrix::colSums(kappamat*theta.wct)
        diagnostic[[a]][docs[[d]][1,]] <-diagnostic[[a]][docs[[d]][1,]] +  word.proj
        agg.wcounts[a] <- agg.wcounts[a] + sum(docs[[d]][2,])
        proj[[a]][d] <- proj[[a]][d] + sum(word.proj)/sum(docs[[d]][2,])
      }
      if(d%%100==0) cat(".")
    }}
  if(!interactions) {
    cat(sprintf("\n Calculation Complete in %.2f seconds",(proc.time() - t1)[3]))
    return(proj)
  } else {
    for(a in 1:length(index)) {
      diagnostic[[a]] <- diagnostic[[a]]/agg.wcounts[a]
    }
    return(list(projection=proj, diagnostic=diagnostic))
  }
}


##
stmproj.interact.theta <- project(stm.out.c, out$documents,interaction=T, int.type="theta")


#########
#########
#
doc.to.tdm <- function(documents, vocab){
  tdm <- matrix(0,nrow=length(documents),
                ncol=length(vocab))
  for(i in 1:length(documents)){
    tdm[i,documents[[i]][1,]] <-
      documents[[i]][2,]
  }
  return(tdm)
}
tdm <- doc.to.tdm(out$documents, out$vocab)

##
fit <- textir::mnlm(NULL,covars=cbind(out$meta$female_complainant),counts=tdm)


##############################
#
##############################
proj <- textir::srproj(fit, tdm)
out$meta$dmr1 <- proj[,1]
out$meta$dmr2 <- proj[,2]
out$meta$proj.interact1 <- stmproj.interact.theta$projection[[1]]
out$meta$proj.interact2 <- stmproj.interact.theta$projection[[2]]
#
out$meta[,paste("Topic", 1:K, sep="")] <- theta

##############################
##############################
breaks <- list()
for(i in 1:K){
  breaks[[paste("Topic", i, sep="")]] <- c(0,.1,1)
}

match.out <- cem::cem("female_complainant", out$meta,
                      drop=names(out$meta)[!names(out$meta)%in%c("female_complainant", paste("Topic", 1:K, sep=""))], 
                      cutpoints=breaks,  method="binary")

match.out
m.data <- out$meta[match.out$matched==T,]
m.data$strata <- match.out$strata[match.out$matched==T]
m.data$weights <- match.out$w[match.out$matched==T]
match.tdm.t <- tdm[out$meta$id%in%m.data$id,]

##############################
##############################
breaks <- list()
for(i in 1:K){
  breaks[[paste("Topic", i, sep="")]] <- c(0,.1,1)
}
breaks$proj.interact1 <- breaks$proj.interact2 <- 5

#
match.out.interact <- cem::cem("female_complainant", out$meta,
                               drop=names(out$meta)[!names(out$meta)%in%c("female_complainant", "proj.interact1","proj.interact2",paste("Topic", 1:K, sep=""))], 
                               cutpoints=breaks)

match.out.interact

m.data.interact <- out$meta[match.out.interact$matched==T,]
m.data.interact$strata <- match.out.interact$strata[match.out.interact$matched==T]
m.data.interact$weights <- match.out.interact$w[match.out.interact$matched==T]
match.tdm.interact <- tdm[out$meta$id%in%m.data.interact$id,]

## 
mvars <- c(paste("Topic", 1:K, sep=""), "urban")
labels <- sageLabels(stm.out.c, n=5)$marginal$prob
labels <- apply(labels,1, function (x) paste(x, collapse=", "))
##
tab <- cbind(apply(out$meta[out$meta$female_complainant==1,mvars],2,mean),
             apply(m.data.interact[m.data.interact$female_complainant==1,mvars],2,mean))
rownames(tab)[1:33] <- paste(rownames(tab)[1:33],labels)
tab <- cbind(tab,tab[,2]-tab[,1])
tab[order((tab[,3])),]

##############################
#############################
breaks <- list()
breaks$proj.interact1 <- breaks$proj.interact2 <- 5

#
match.out.dmr <- cem::cem("female_complainant", out$meta,
                          drop=names(out$meta)[!names(out$meta)%in%c("female_complainant","proj.interact1","proj.interact2")], 
                          cutpoints=breaks)

m.data.dmr <- out$meta[match.out.dmr$matched==T,]
m.data.dmr$strata <- match.out.dmr$strata[match.out.dmr$matched==T]
m.data.dmr$weights <- match.out.dmr$w[match.out.dmr$matched==T]
match.tdm.dmr <- tdm[out$meta$id%in%m.data.dmr$id,]

#######
######################################
topicsfemale_complainant.t <-
  apply(m.data[m.data$female_complainant==1,c(paste("Topic", 1:K, sep=""))],2, weighted.mean,
        w=m.data$weights[m.data$female_complainant==1])
topicsnfemale_complainant.t <-
  apply(m.data[m.data$female_complainant==0,c(paste("Topic",1:K, sep=""))],2,weighted.mean,
        w=m.data$weights[m.data$female_complainant==0])
topics.diff.t <- topicsfemale_complainant.t-topicsnfemale_complainant.t

topicsfemale_complainant.interact <-
  apply(m.data.interact[m.data.interact$female_complainant==1,c(paste("Topic", 1:K, sep=""))],2, weighted.mean,
        w=m.data.interact$weights[m.data.interact$female_complainant==1])
topicsnfemale_complainant.interact <-
  apply(m.data.interact[m.data.interact$female_complainant==0,c(paste("Topic",1:K, sep=""))],2,weighted.mean,
        w=m.data.interact$weights[m.data.interact$female_complainant==0])
topics.diff.interact <- topicsfemale_complainant.interact-topicsnfemale_complainant.interact


topicsfemale_complainant.dmr <-
  apply(m.data.dmr[m.data.dmr$female_complainant==1,c(paste("Topic",1:K, sep=""))],2,weighted.mean,
        w=m.data.dmr$weights[m.data.dmr$female_complainant==1])
topicsnfemale_complainant.dmr <-
  apply(m.data.dmr[m.data.dmr$female_complainant==0,c(paste("Topic",1:K, sep=""))],2,weighted.mean,
        w=m.data.dmr$weights[m.data.dmr$female_complainant==0])
topics.diff.dmr <- topicsfemale_complainant.dmr-topicsnfemale_complainant.dmr

topicsfemale_complainant <-
  apply(out$meta[out$meta$female_complainant==1,c(paste("Topic",1:K, sep=""))],2,mean)
topicsnfemale_complainant <-
  apply(out$meta[out$meta$female_complainant==0,c(paste("Topic",1:K, sep=""))],2,mean)
topics.diff <- topicsfemale_complainant-topicsnfemale_complainant

#
labels <- sageLabels(stm.out.c, n=5)$marginal$prob #
labels <- apply(labels,1, function (x) paste(x, collapse=", "))

##
pdf("delete3.pdf", width=10, height=7)
par(mar=c(7,1,.1,1))
plot(900,00, col="white", xlim=c(-.23,.09), ylim=c(0,K), yaxt="n",
     xlab="",
     ylab="", cex.lab=1.5, cex.axis=1.5,axes=F)
text(0,-3,"\n \n Mean topic difference (Women-Men)",cex=1.5,xpd=NA)
axis(1,at=seq(-.1,.1,.05),cex.lab=1.5, cex.axis=1.5)
for(i in (0:K)[0:K %% 2==0]){
  polygon(x=c(-1,1,1,-1),y=c(i,i,i+1,i+1),col="gray90",border=NA)
}
for(i in 1:K){
  polygon(x=c(0,sort(topics.diff)[i],sort(topics.diff)[i],0),
          y=c(i-1,i-1,i,i),col="gray70")
}
## 
points(x=topics.diff.dmr[names(sort(topics.diff))],y=0:(K-1)+.55, pch=1,cex=1.4)
points(x=topics.diff.t[names(sort(topics.diff))],y=0:(K-1)+.45, pch=2,cex=1.4)
points(x=topics.diff.interact[names(sort(topics.diff))],y=0:(K-1)+.65, pch=19,cex=1.65)
## 
names(labels) <- names(topics.diff)
text(rep(-.245,K),y=(1:K)-.5,paste("Topic", gsub("Topic","",names(labels[names(sort(topics.diff))])) , ": ",labels[names(sort(topics.diff))]),
     cex=1.2,pos=4, xpd=NA)
## 
legend(-.24,-.3,pch=c(22,19,1,2,0),pt.cex=c(3,1.5,1.2,1.2,1.2),pt.bg=c("gray70",NA,NA,NA,NA),
       legend=c("Full Data Set (Unmatched)","TIRM","Projection Matching","Topic Matching"),
       xpd=NA,bty="n",cex=1.2)
dev.off()#####Figure 10

#####

#############
#Matched Cases (Not Anonymized)
#############
#strat <- unique(m.data.interact$strata)
#set.seed(092)
#sampstrat <- sample(strat, 100)
#matches <- matrix(nrow=100, ncol=4)
#for(i in 1:length(sampstrat)){
  #stratdat <- m.data.interact[m.data.interact$strata==sampstrat[i],]
  #fem <- which(stratdat$female_complainant==1)
  #if(length(fem)==1) female <- stratdat[fem,]
  #if(length(fem)>1) female <- stratdat[sample(fem,1),]
  #fem <- which(stratdat$female_complainant==0)
  #if(length(fem)==1) male <- stratdat[fem,]
  #if(length(fem)>1) male <- stratdat[sample(fem,1),]
  #matches[i,1:2] <- c(female$translated, female$sections)#
  #matches[i,3:4] <- c(male$translated, male$sections)#
#}
## 
#matches <- matches[1:6,]
#sink("matched_text.txt")
#xtable::xtable(matches)
#sink()

#################
reg1 <- lm(convicted ~ female_complainant, data=m.data.interact, weights=weights)
reg2 <- lm(convicted ~ female_complainant + distance + io_rank + my_station + Month_Yr, data=m.data.interact, weights=weights)

reg3 <- lm(acquitted ~ female_complainant, data=m.data.interact, weights=weights)
reg4 <- lm(acquitted ~ female_complainant + distance + io_rank + my_station + Month_Yr, data=m.data.interact, weights=weights)

library(tidyr)
check <- m.data.interact %>% drop_na(judge_rank)

regression1 <- lm(convicted ~ female_complainant, data=check, weights=weights)
regression2 <- lm(convicted ~ female_complainant + distance + io_rank + my_station + Month_Yr + judge_rank, data=check, weights=weights)
regression3 <- lm(acquitted ~ female_complainant, data=check, weights=weights)
regression4 <- lm(acquitted ~ female_complainant + distance + io_rank  + judge_rank + my_station + Month_Yr, data=check, weights=weights)

##Table 7, top row
stargazer(reg1, reg2, reg3, reg4, type="text", align = FALSE,omit.stat=c("ser","f"), omit = c("my_station","Month_Yr","io_rank", "judge_rank", "distance", "ps_category", "urban"))
stargazer(regression1, regression2, regression3, regression4, type="text", align = FALSE,omit.stat=c("ser","f"), omit = c("my_station","Month_Yr","io_rank", "judge_rank", "distance", "ps_category", "urban"))
stargazer(reg1, reg2, reg3, reg4,regression1, regression2, regression3, regression4, type="text", align = FALSE,omit.stat=c("ser","f"), omit = c("my_station","Month_Yr","io_rank", "judge_rank", "distance", "ps_category", "urban"))
stargazer(reg1, reg2, reg3, reg4,regression1, regression2, regression3, regression4, align = FALSE,omit.stat=c("ser","f"), omit = c("my_station","Month_Yr","io_rank", "judge_rank", "distance", "ps_category", "urban"))
#stargazer(reg1, reg2, reg3, reg4,regression1, regression2, regression3, regression4, align = FALSE, out="filename.tex")
########################################################################################################################


#######
####################################################################################################
data <- data[with(data, grepl(0, data$gendered)),]
data$female_complainant <- ifelse(grepl("female", data$complainant_gender), 1, 0)

##
set.seed(23456)
processed <- textProcessor(documents = data$text, metadata = data)

out <- prepDocuments(documents = processed$documents, 
                     vocab = processed$vocab,
                     meta = processed$meta, lower.thresh = 500)
docs <- out$documents
vocab <- out$vocab
meta <- out$meta

##########
K <- 32 
stm.out.c <- stm(out$documents, out$vocab, K=K, prevalence=~
                   female_complainant, content=~female_complainant,data=out$meta,
                 max.em.its=25, 
                 seed=1033311)

labelTopics(stm.out.c)
plot(stm.out.c, n=6, type = "summary", xlim = c(0, 0.35), main = "All Crime Content Covariate")

##
betaindex <- rep(2,length(stm.out.c$settings$covariates$betaindex))
beta <- list(beta=lapply(stm.out.c$beta$logbeta, exp))
if(!is.null(stm.out.c$beta$kappa)) beta$kappa <- stm.out.c$beta$kappa
sigma <- stm.out.c$sigma
lambda <- stm.out.c$eta
suffstats <- stm:::estep(documents=out$documents,
                         beta.index=betaindex,
                         update.mu=(!is.null(stm.out.c$mu$gamma)),
                         beta$beta,
                         lambda, stm.out.c$mu$mu,
                         sigma,
                         verbose=T)
lambda <- cbind(suffstats$lambda,0)
theta <- exp(lambda - stm:::row.lse(lambda))
rm(betaindex,beta,sigma,lambda, suffstats)

## 
library(devtools)
#install_github("bstewart/stm", dependencies=T)
library(stm)
library(stringr)
library(textir)
library(cem)
library(kernlab)
library(rbounds)
library(xtable)
library(MASS)
library(weights)
library(Zelig)
library(stargazer)

## 
project <- function(mod, docs, interactions=FALSE, 
                    int.type=c("refit", "phi", "theta")) {
  int.type <- match.arg(int.type)
  t1 <- proc.time()
  if(!require(Matrix)) stop("Install the Matrix Package")  
  kappa <- mod$beta$kappa$params
  K <- mod$setting$dim$K
  A <- mod$setting$dim$A
  index <- sort(unique(mod$settings$covariates$betaindex))
  if(is.null(mod$settings$kappa$contrast)) mod$settings$kappa$contrast <- FALSE
  contrast <- mod$settings$kappa$contrast
  if(contrast) { 
    index <- index[-length(index)]  
  }
  
  proj <- vector(mode="list", length=length(index))
  for(i in 1:length(index)) {
    proj[[i]] <- unlist(lapply(docs, function(doc) {
      sum((kappa[[K+index[i]]])[doc[1,]]*doc[2,])/sum(doc[2,])
    }))
  }
  
  if(interactions & contrast) {
    if(int.type%in% c("refit", "phi")) stop("Only int.type='theta' works with contrast")
  }
  
  if(interactions & int.type=="refit") {
    cat("Starting Interactions...\n This may take a while...it prints a dot every 100 docs. \n")
    logbeta <- mod$beta$logbeta
    diagnostic <- vector(mode="list", length=length(index))
    for(i in 1:length(diagnostic)) {
      diagnostic[[i]] <- rep(0,mod$settings$dim$V)
    }
    agg.wcounts <- rep(0, length(index))
    
    for(d in 1:length(docs)) {
      for(a in 1:A) {
        expeta <- c(exp(mod$eta[d,]),1)
        
        
        EB <- expeta*exp(logbeta[[index[a]]][,docs[[d]][1,],drop=FALSE])
        EB <- t(EB)/colSums(EB) 
        
        
        phi <- t(EB*(docs[[d]][2,])) 
        
        kappamat <- Matrix(do.call(rbind,kappa[(K+ length(index) + (a-1)*K + 1):(K+ length(index) + a*K)]))
        kappamat <- kappamat[,docs[[d]][1,]]
        word.proj <- Matrix::colSums(kappamat*phi)
        
        diagnostic[[a]][docs[[d]][1,]] <-diagnostic[[a]][docs[[d]][1,]] +  word.proj
        agg.wcounts[a] <- agg.wcounts[a] + sum(docs[[d]][2,])
        
        proj[[a]][d] <- proj[[a]][d] + sum(word.proj)/sum(docs[[d]][2,])
      }
      if(d%%100==0) cat(".")
    }
  } 
  if(interactions & int.type=="phi") {
    cat("Starting Interactions...\n This may take a while...it prints a dot every 100 docs. \n")
    logbeta <- mod$beta$logbeta
    diagnostic <- vector(mode="list", length=length(index))
    for(i in 1:length(diagnostic)) {
      diagnostic[[i]] <- rep(0,mod$settings$dim$V)
    }
    agg.wcounts <- rep(0, length(index))
    bindex <- mod$settings$covariates$betaindex
    for(d in 1:length(docs)) {
      expeta <- c(exp(mod$eta[d,]),1)
      EB <- expeta*exp(logbeta[[bindex[d]]][,docs[[d]][1,],drop=FALSE]) 
      EB <- t(EB)/colSums(EB) 
      
      phi <- t(EB*(docs[[d]][2,])) 
      for(a in 1:length(index)) {
        
        kappamat <- Matrix(do.call(rbind,kappa[(K+ length(index) + (a-1)*K + 1):(K+ length(index) + a*K)]))
        kappamat <- kappamat[,docs[[d]][1,]]
        word.proj <- Matrix::colSums(kappamat*phi)
        
        diagnostic[[a]][docs[[d]][1,]] <-diagnostic[[a]][docs[[d]][1,]] +  word.proj
        agg.wcounts[a] <- agg.wcounts[a] + sum(docs[[d]][2,])
        
        proj[[a]][d] <- proj[[a]][d] + sum(word.proj)/sum(docs[[d]][2,])
      }
      if(d%%100==0) cat(".")
    }}
  if(interactions & int.type=="theta") {
    cat("Starting Interactions...\n This may take a while...it prints a dot every 100 docs. \n")
    diagnostic <- vector(mode="list", length=length(index))
    for(i in 1:length(diagnostic)) {
      diagnostic[[i]] <- rep(0,mod$settings$dim$V)
    }
    agg.wcounts <- rep(0, length(index))
    for(d in 1:length(docs)) {
      theta <- mod$theta[d,]
      wct <- matrix(rep(docs[[d]][2,], each=K), nrow=K)
      theta.wct <- wct*theta
      for(a in 1:length(index)) {    
        kappamat <- Matrix(do.call(rbind,kappa[(K+ length(index) + (a-1)*K + 1):(K+ length(index) + a*K)]))
        kappamat <- kappamat[,docs[[d]][1,]]
        word.proj <- Matrix::colSums(kappamat*theta.wct)
        diagnostic[[a]][docs[[d]][1,]] <-diagnostic[[a]][docs[[d]][1,]] +  word.proj
        agg.wcounts[a] <- agg.wcounts[a] + sum(docs[[d]][2,])
        proj[[a]][d] <- proj[[a]][d] + sum(word.proj)/sum(docs[[d]][2,])
      }
      if(d%%100==0) cat(".")
    }}
  if(!interactions) {
    cat(sprintf("\n Calculation Complete in %.2f seconds",(proc.time() - t1)[3]))
    return(proj)
  } else {
    for(a in 1:length(index)) {
      diagnostic[[a]] <- diagnostic[[a]]/agg.wcounts[a]
    }
    return(list(projection=proj, diagnostic=diagnostic))
  }
}

##
stmproj.interact.theta <- project(stm.out.c, out$documents,interaction=T, int.type="theta")


#########
#########
doc.to.tdm <- function(documents, vocab){
  tdm <- matrix(0,nrow=length(documents),
                ncol=length(vocab))
  for(i in 1:length(documents)){
    tdm[i,documents[[i]][1,]] <-
      documents[[i]][2,]
  }
  return(tdm)
}
tdm <- doc.to.tdm(out$documents, out$vocab)

## 
fit <- textir::mnlm(NULL,covars=cbind(out$meta$female_complainant),counts=tdm)

##############################
##############################
proj <- textir::srproj(fit, tdm)
out$meta$dmr1 <- proj[,1]
out$meta$dmr2 <- proj[,2]
out$meta$proj.interact1 <- stmproj.interact.theta$projection[[1]]
out$meta$proj.interact2 <- stmproj.interact.theta$projection[[2]]
#
out$meta[,paste("Topic", 1:K, sep="")] <- theta

##############################
##############################
breaks <- list()
for(i in 1:K){
  breaks[[paste("Topic", i, sep="")]] <- c(0,.1,1)
}

match.out <- cem::cem("female_complainant", out$meta,
                      drop=names(out$meta)[!names(out$meta)%in%c("female_complainant", paste("Topic", 1:K, sep=""))], 
                      cutpoints=breaks,  method="binary")

match.out
m.data <- out$meta[match.out$matched==T,]
m.data$strata <- match.out$strata[match.out$matched==T]
m.data$weights <- match.out$w[match.out$matched==T]
match.tdm.t <- tdm[out$meta$id%in%m.data$id,]

##############################
##############################
breaks <- list()
for(i in 1:K){
  breaks[[paste("Topic", i, sep="")]] <- c(0,.1,1)
}
breaks$proj.interact1 <- breaks$proj.interact2 <- 5


##
match.out.interact <- cem::cem("female_complainant", out$meta,
                               drop=names(out$meta)[!names(out$meta)%in%c("female_complainant", "proj.interact1","proj.interact2",paste("Topic", 1:K, sep=""))], 
                               cutpoints=breaks)

match.out.interact

m.data.interact <- out$meta[match.out.interact$matched==T,]
m.data.interact$strata <- match.out.interact$strata[match.out.interact$matched==T]
m.data.interact$weights <- match.out.interact$w[match.out.interact$matched==T]
match.tdm.interact <- tdm[out$meta$id%in%m.data.interact$id,]

## 
mvars <- c(paste("Topic", 1:K, sep=""), "urban")
labels <- sageLabels(stm.out.c, n=5)$marginal$prob
labels <- apply(labels,1, function (x) paste(x, collapse=", "))
##
tab <- cbind(apply(out$meta[out$meta$female_complainant==1,mvars],2,mean),
             apply(m.data.interact[m.data.interact$female_complainant==1,mvars],2,mean))
rownames(tab)[1:33] <- paste(rownames(tab)[1:33],labels)
tab <- cbind(tab,tab[,2]-tab[,1])
tab[order((tab[,3])),]

##############################
#############################
breaks <- list()
breaks$proj.interact1 <- breaks$proj.interact2 <- 5

#
match.out.dmr <- cem::cem("female_complainant", out$meta,
                          drop=names(out$meta)[!names(out$meta)%in%c("female_complainant","proj.interact1","proj.interact2")], 
                          cutpoints=breaks)

m.data.dmr <- out$meta[match.out.dmr$matched==T,]
m.data.dmr$strata <- match.out.dmr$strata[match.out.dmr$matched==T]
m.data.dmr$weights <- match.out.dmr$w[match.out.dmr$matched==T]
match.tdm.dmr <- tdm[out$meta$id%in%m.data.dmr$id,]

#######
############################
######################################
topicsfemale_complainant.t <-
  apply(m.data[m.data$female_complainant==1,c(paste("Topic", 1:K, sep=""))],2, weighted.mean,
        w=m.data$weights[m.data$female_complainant==1])
topicsnfemale_complainant.t <-
  apply(m.data[m.data$female_complainant==0,c(paste("Topic",1:K, sep=""))],2,weighted.mean,
        w=m.data$weights[m.data$female_complainant==0])
topics.diff.t <- topicsfemale_complainant.t-topicsnfemale_complainant.t

topicsfemale_complainant.interact <-
  apply(m.data.interact[m.data.interact$female_complainant==1,c(paste("Topic", 1:K, sep=""))],2, weighted.mean,
        w=m.data.interact$weights[m.data.interact$female_complainant==1])
topicsnfemale_complainant.interact <-
  apply(m.data.interact[m.data.interact$female_complainant==0,c(paste("Topic",1:K, sep=""))],2,weighted.mean,
        w=m.data.interact$weights[m.data.interact$female_complainant==0])
topics.diff.interact <- topicsfemale_complainant.interact-topicsnfemale_complainant.interact


topicsfemale_complainant.dmr <-
  apply(m.data.dmr[m.data.dmr$female_complainant==1,c(paste("Topic",1:K, sep=""))],2,weighted.mean,
        w=m.data.dmr$weights[m.data.dmr$female_complainant==1])
topicsnfemale_complainant.dmr <-
  apply(m.data.dmr[m.data.dmr$female_complainant==0,c(paste("Topic",1:K, sep=""))],2,weighted.mean,
        w=m.data.dmr$weights[m.data.dmr$female_complainant==0])
topics.diff.dmr <- topicsfemale_complainant.dmr-topicsnfemale_complainant.dmr

topicsfemale_complainant <-
  apply(out$meta[out$meta$female_complainant==1,c(paste("Topic",1:K, sep=""))],2,mean)
topicsnfemale_complainant <-
  apply(out$meta[out$meta$female_complainant==0,c(paste("Topic",1:K, sep=""))],2,mean)
topics.diff <- topicsfemale_complainant-topicsnfemale_complainant

#
labels <- sageLabels(stm.out.c, n=5)$marginal$prob 
labels <- apply(labels,1, function (x) paste(x, collapse=", "))

##
pdf("delete2.pdf", width=10, height=7)
par(mar=c(7,1,.1,1))
plot(900,00, col="white", xlim=c(-.23,.09), ylim=c(0,K), yaxt="n",
     xlab="",
     ylab="", cex.lab=1.5, cex.axis=1.5,axes=F)
text(0,-3,"\n \n Mean topic difference (Women-Men)",cex=1.5,xpd=NA)
axis(1,at=seq(-.1,.1,.05),cex.lab=1.5, cex.axis=1.5)
for(i in (0:K)[0:K %% 2==0]){
  polygon(x=c(-1,1,1,-1),y=c(i,i,i+1,i+1),col="gray90",border=NA)
}
for(i in 1:K){
  polygon(x=c(0,sort(topics.diff)[i],sort(topics.diff)[i],0),
          y=c(i-1,i-1,i,i),col="gray70")
}
##
points(x=topics.diff.dmr[names(sort(topics.diff))],y=0:(K-1)+.55, pch=1,cex=1.4)
points(x=topics.diff.t[names(sort(topics.diff))],y=0:(K-1)+.45, pch=2,cex=1.4)
points(x=topics.diff.interact[names(sort(topics.diff))],y=0:(K-1)+.65, pch=19,cex=1.65)
##
names(labels) <- names(topics.diff)
text(rep(-.245,K),y=(1:K)-.5,paste("Topic", gsub("Topic","",names(labels[names(sort(topics.diff))])) , ": ",labels[names(sort(topics.diff))]),
     cex=1.2,pos=4, xpd=NA)
##
legend(-.24,-.3,pch=c(22,19,1,2,0),pt.cex=c(3,1.5,1.2,1.2,1.2),pt.bg=c("gray70",NA,NA,NA,NA),
       legend=c("Full Data Set (Unmatched)","TIRM","Projection Matching","Topic Matching"),
       xpd=NA,bty="n",cex=1.2)
dev.off()#Figure A45

##
rreg1 <- lm(convicted ~ female_complainant, data=m.data.interact, weights=weights)
rreg2 <- lm(convicted ~ female_complainant + distance + io_rank + my_station + Month_Yr, data=m.data.interact, weights=weights)
rreg3 <- lm(acquitted ~ female_complainant, data=m.data.interact, weights=weights)
rreg4 <- lm(acquitted ~ female_complainant + distance + io_rank + my_station + Month_Yr, data=m.data.interact, weights=weights)

##
check <- m.data.interact %>% drop_na(judge_rank)

rreg5 <- lm(convicted ~ female_complainant, data=check, weights=weights)
rreg6 <- lm(convicted ~ female_complainant + distance + io_rank + my_station + Month_Yr + judge_rank, data=check, weights=weights)
rreg7 <- lm(acquitted ~ female_complainant, data=check, weights=weights)
rreg8 <- lm(acquitted ~ female_complainant + distance + io_rank  + judge_rank + my_station + Month_Yr, data=check, weights=weights)

#Table 7, Bottom Row
stargazer(rreg1, rreg2, rreg3, rreg4, rreg5, rreg6, rreg7, rreg8, align = FALSE,type="text", omit.stat=c("ser","f"), omit = c("my_station","Month_Yr","io_rank", "judge_rank", "distance", "ps_category","urban"))
stargazer(rreg1, rreg2, rreg3, rreg4, rreg5, rreg6, rreg7, rreg8, align = FALSE,omit.stat=c("ser","f"), omit = c("my_station","Month_Yr","io_rank", "judge_rank", "distance", "ps_category","urban"))
stargazer(rreg1, rreg2, rreg3, rreg4, rreg5, rreg6, rreg7, rreg8)
#stargazer(rreg1, rreg2, rreg3, rreg4, rreg5, rreg6, rreg7, rreg8, align = FALSE,omit.stat=c("ser","f"), out="filename.tex")









